Energy Consumption Forecasting with ML in France¶
Research Questions
To what extent can machine learning models predict daily electricity consumption across French regions, and how do weather and density features improve predictive accuracy?
Which features are most strongly associated with variations in consumption, and how can interpretable ML techniques (e.g., SHAP) be used to understand and communicate these associations?
In [1]:
# IMPORT LIBRARIES
# (fixed: matplotlib.pyplot was imported twice; imports grouped stdlib -> third-party)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
In [4]:
# Load the dataset (missing values are inspected and dropped in later cells, not here)
cons_filepath = "france_energy_meteo_daily.csv"
energy_source_df = pd.read_csv(cons_filepath)
# Display basic info about the dataset (dtypes, non-null counts, memory usage)
energy_source_df.info()
# Create an output directory for saved figures, relative to the current working directory
project_dir = os.getcwd()
figures_path = os.path.join(project_dir, "output")
os.makedirs(figures_path, exist_ok=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 56862 entries, 0 to 56861 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 56862 non-null object 1 region_name 56862 non-null object 2 RR 56862 non-null float64 3 ALTI 56862 non-null float64 4 PMERM 56862 non-null float64 5 INST 56862 non-null float64 6 GLOT 56862 non-null float64 7 DHUMI40 56862 non-null float64 8 NEIGETOTX 56862 non-null float64 9 NEIG 56862 non-null float64 10 BROU 56862 non-null float64 11 ORAG 56853 non-null float64 12 GRESIL 47779 non-null float64 13 GRELE 47937 non-null float64 14 ROSEE 47894 non-null float64 15 VERGLAS 55496 non-null float64 16 SOLNEIGE 47761 non-null float64 17 GELEE 56364 non-null float64 18 FUMEE 56862 non-null float64 19 BRUME 56862 non-null float64 20 ECLAIR 47929 non-null float64 21 TX 56862 non-null float64 22 TNSOL 56862 non-null float64 23 FXY 56862 non-null float64 24 DRR 56862 non-null float64 25 superf 35064 non-null float64 26 population 35064 non-null float64 27 density 35064 non-null float64 28 Région 51864 non-null object 29 Consumption 51864 non-null float64 30 Workday? 56862 non-null int64 dtypes: float64(27), int64(1), object(3) memory usage: 13.4+ MB
In [5]:
# Data-quality check: per-column share of missing values and of exact zeros.
# 1. Percentage of null values (works for every column, including object dtype)
null_percentage = (100 - energy_source_df.notnull().mean() * 100).round()
# 2. Percentage of zeros — only meaningful for numeric columns
numeric_part = energy_source_df.select_dtypes(include=[np.number])
zero_percentage = ((numeric_part == 0).mean() * 100).round()
# 3. Side-by-side summary (non-numeric columns show NaN in the zeros column)
summary_df = pd.DataFrame({
    "Percentage of Null Values": null_percentage,
    "Percentage of Zeros": zero_percentage,
})
# 4. Print the whole table without row truncation
print(summary_df.to_string())
Percentage of Null Values Percentage of Zeros ALTI 0.0 0.0 BROU 0.0 36.0 BRUME 0.0 29.0 Consumption 9.0 0.0 DHUMI40 0.0 42.0 DRR 0.0 24.0 Date 0.0 NaN ECLAIR 16.0 76.0 FUMEE 0.0 100.0 FXY 0.0 0.0 GELEE 1.0 91.0 GLOT 0.0 0.0 GRELE 16.0 73.0 GRESIL 16.0 82.0 INST 0.0 1.0 NEIG 0.0 87.0 NEIGETOTX 0.0 65.0 ORAG 0.0 74.0 PMERM 0.0 0.0 ROSEE 16.0 58.0 RR 0.0 2.0 Région 9.0 NaN SOLNEIGE 16.0 82.0 TNSOL 0.0 0.0 TX 0.0 0.0 VERGLAS 2.0 88.0 Workday? 0.0 3.0 density 38.0 0.0 population 38.0 0.0 region_name 0.0 NaN superf 38.0 0.0
In [6]:
# Clean the dataset.
# (fixed: replaced inplace=True mutation with explicit reassignment — same result,
# but idempotent and chain-friendly per pandas guidance.)
# Drop rows with any missing value (NA shares were inspected in the previous cell)
energy_source_df = energy_source_df.dropna()
# Ensure the Date column is a proper datetime so the .dt accessor works below
energy_source_df["Date"] = pd.to_datetime(energy_source_df["Date"])
# "Région" duplicates "region_name" — keep only one copy
energy_source_df = energy_source_df.drop(columns=["Région"])
# Scale consumption into kWh (x1000 — source unit presumably MWh; TODO confirm)
energy_source_df["Consumption"] = energy_source_df["Consumption"] * 1000
# Per-capita consumption normalizes away region population size
energy_source_df["Consumption_per_capita"] = energy_source_df["Consumption"] / energy_source_df["population"]
# Day of week (0 = Monday) stored as a categorical feature
energy_source_df["day_of_week"] = energy_source_df["Date"].dt.dayofweek.astype("category")
In [7]:
# Re-inspect the dataset after cleaning: rows with NAs are gone, "Région" dropped,
# and the two derived columns (Consumption_per_capita, day_of_week) now appear
energy_source_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 28860 entries, 0 to 37985 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 28860 non-null datetime64[ns] 1 region_name 28860 non-null object 2 RR 28860 non-null float64 3 ALTI 28860 non-null float64 4 PMERM 28860 non-null float64 5 INST 28860 non-null float64 6 GLOT 28860 non-null float64 7 DHUMI40 28860 non-null float64 8 NEIGETOTX 28860 non-null float64 9 NEIG 28860 non-null float64 10 BROU 28860 non-null float64 11 ORAG 28860 non-null float64 12 GRESIL 28860 non-null float64 13 GRELE 28860 non-null float64 14 ROSEE 28860 non-null float64 15 VERGLAS 28860 non-null float64 16 SOLNEIGE 28860 non-null float64 17 GELEE 28860 non-null float64 18 FUMEE 28860 non-null float64 19 BRUME 28860 non-null float64 20 ECLAIR 28860 non-null float64 21 TX 28860 non-null float64 22 TNSOL 28860 non-null float64 23 FXY 28860 non-null float64 24 DRR 28860 non-null float64 25 superf 28860 non-null float64 26 population 28860 non-null float64 27 density 28860 non-null float64 28 Consumption 28860 non-null float64 29 Workday? 28860 non-null int64 30 Consumption_per_capita 28860 non-null float64 31 day_of_week 28860 non-null category dtypes: category(1), datetime64[ns](1), float64(28), int64(1), object(1) memory usage: 7.1+ MB
In [8]:
# Descriptive statistics (count / mean / min / quartiles / max / std) for the remaining columns
energy_source_df.describe()
Out[8]:
| Date | RR | ALTI | PMERM | INST | GLOT | DHUMI40 | NEIGETOTX | NEIG | BROU | ... | TX | TNSOL | FXY | DRR | superf | population | density | Consumption | Workday? | Consumption_per_capita | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 28860 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | ... | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 28860.000000 | 2.886000e+04 | 28860.000000 | 2.886000e+04 | 28860.000000 | 28860.000000 |
| mean | 2017-10-05 05:30:42.661122816 | 2.493392 | 324.260537 | 1017.240516 | 336.242005 | 1281.116142 | 65.949851 | 2.677550 | 0.069322 | 0.277349 | ... | 16.743264 | 6.080479 | 6.729508 | 111.696284 | 44493.318204 | 5.552967e+06 | 200.393641 | 2.189303e+08 | 0.970132 | 40.370948 |
| min | 2014-01-01 00:00:00 | 0.000000 | 64.802817 | 975.620000 | 0.000000 | 34.222222 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | -5.798061 | -14.300000 | 1.938889 | 0.000000 | 12064.688640 | 2.576252e+06 | 58.359735 | 5.959200e+07 | 0.000000 | 18.378166 |
| 25% | 2015-10-18 00:00:00 | 0.052136 | 98.726010 | 1012.759286 | 123.000000 | 556.265203 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 11.112390 | 1.728529 | 5.146868 | 0.428571 | 30118.557770 | 3.330478e+06 | 78.061664 | 1.343175e+08 | 1.000000 | 33.248847 |
| 50% | 2017-08-04 12:00:00 | 0.448293 | 162.059902 | 1017.542262 | 304.354167 | 1169.823529 | 1.217708 | 0.000000 | 0.000000 | 0.200000 | ... | 16.426581 | 6.062500 | 6.303510 | 29.649123 | 32363.434265 | 5.127840e+06 | 112.923194 | 1.968565e+08 | 1.000000 | 39.245922 |
| 75% | 2019-09-11 06:00:00 | 3.024985 | 532.332191 | 1022.420357 | 525.848750 | 1936.217568 | 63.659572 | 0.679434 | 0.000000 | 0.458333 | ... | 22.175000 | 10.517269 | 7.874243 | 161.925000 | 70795.544145 | 6.009976e+06 | 158.830373 | 2.828795e+08 | 1.000000 | 46.793050 |
| max | 2021-12-31 00:00:00 | 83.744086 | 916.616505 | 1047.266667 | 912.222222 | 3131.843137 | 930.432099 | 61.356436 | 1.000000 | 1.000000 | ... | 41.707692 | 20.911111 | 20.612766 | 1420.142857 | 85103.128115 | 1.231728e+07 | 1020.936335 | 6.568900e+08 | 1.000000 | 75.535308 |
| std | NaN | 4.519007 | 262.626162 | 8.401296 | 241.866574 | 811.431997 | 128.717643 | 7.472844 | 0.201075 | 0.276446 | ... | 7.338474 | 5.739378 | 2.232564 | 168.771991 | 22721.036332 | 2.722115e+06 | 273.265424 | 1.041739e+08 | 0.170227 | 9.586921 |
8 rows × 30 columns
In [9]:
# Log-transform the outcome to improve numerical stability for scale-sensitive
# models such as SVR or LinearRegression
energy_source_df['LOG_Consumption_per_capita'] = np.log1p(energy_source_df['Consumption_per_capita']) # computes ln(1 + consumption per capita)
In [10]:
# Compare the raw and log-transformed target distributions side by side.
# (fixed: the right-hand title read "Log(1 + LOG_Consumption_per_capita)" — a
# doubled LOG prefix — and the comments referred to "Income" from a template.)
sns.set(style="whitegrid")
# Create subplots: raw target on the left, log1p target on the right
fig, axs = plt.subplots(1, 2, figsize=(14, 5))
# Raw per-capita consumption
sns.histplot(energy_source_df['Consumption_per_capita'], bins=50, ax=axs[0], kde=True)
axs[0].set_title("Original Consumption_per_capita Distribution")
axs[0].set_xlabel("Consumption_per_capita")
# Log1p-transformed per-capita consumption
sns.histplot(energy_source_df['LOG_Consumption_per_capita'], bins=50, ax=axs[1], kde=True, color='orange')
axs[1].set_title("Log(1 + Consumption_per_capita) Distribution")
axs[1].set_xlabel("LOG_Consumption_per_capita")
plt.tight_layout()
plt.show()
Looking at the distributions:
- Original Consumption_per_capita is already fairly symmetric, bell-shaped, and doesn’t have extreme skewness.
- The log-transformed version is slightly tighter, but not substantially different — and definitely not enough to justify the loss in interpretability for tree-based and deep learning models.
In [11]:
# Target variable: per-capita daily consumption
target = "Consumption_per_capita"
# Model features: weather measurements, density, and calendar flags.
# Excluded on purpose: the target itself, total Consumption, population,
# region identifiers, and Date.
# NOTE(review): "day_of_week" is a pandas category dtype despite sitting in a
# list named numeric_features; a later cell casts these columns to float.
numeric_features = [
    "RR", "ALTI", "PMERM", "INST", "GLOT", "DHUMI40", "NEIGETOTX", "BROU", "ORAG",
    "GRESIL", "GRELE", "ROSEE", "VERGLAS", "GELEE", "FUMEE", "BRUME", "ECLAIR",
    "TX", "TNSOL", "FXY", "DRR", "density", "Workday?", "day_of_week"
]
Train-Test Split¶
To evaluate model performance fairly, we divide our dataset into training and testing subsets. This ensures that we train our models on one portion of the data and test their predictive ability on unseen examples.
- Training Set (80%): Used to train the model
- Testing Set (20%): Held out for evaluating out-of-sample performance
In [12]:
# Define the features (X) and target (y)
X = energy_source_df[numeric_features]
y = energy_source_df[target]
# Split the data into training (80%) and testing (20%) sets; fixed seed for reproducibility.
# NOTE(review): the split is random over daily rows — for a forecasting claim, a
# chronological split would avoid look-ahead leakage; confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Print the shapes of the resulting datasets as a sanity check
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
X_train shape: (23088, 24) X_test shape: (5772, 24) y_train shape: (23088,) y_test shape: (5772,)
Data Exploration¶
In [13]:
# Step 1: Reattach the target to the training features so exploration below
# uses the training split only (keeps the test set unseen)
train_data = pd.concat([X_train, y_train], axis=1)
# Step 2: Summary statistics of the training data
train_data.describe()
Out[13]:
| RR | ALTI | PMERM | INST | GLOT | DHUMI40 | NEIGETOTX | BROU | ORAG | GRESIL | ... | FUMEE | BRUME | ECLAIR | TX | TNSOL | FXY | DRR | density | Workday? | Consumption_per_capita | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | ... | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 | 23088.000000 |
| mean | 2.488422 | 323.613836 | 1017.240810 | 337.011488 | 1284.290223 | 65.865411 | 2.712122 | 0.276783 | 0.122969 | 0.008952 | ... | 0.000491 | 0.256514 | 0.037379 | 16.760534 | 6.089156 | 6.727186 | 111.885833 | 200.763206 | 0.970591 | 40.354093 |
| std | 4.488603 | 262.551887 | 8.375842 | 242.236362 | 812.030615 | 128.187061 | 7.555000 | 0.276167 | 0.248688 | 0.061797 | ... | 0.007341 | 0.263758 | 0.140028 | 7.353359 | 5.735963 | 2.226330 | 168.423831 | 273.731044 | 0.168954 | 9.563325 |
| min | 0.000000 | 64.802817 | 975.620000 | 0.000000 | 34.222222 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | -5.798061 | -14.300000 | 2.028571 | 0.000000 | 58.359735 | 0.000000 | 18.639506 |
| 25% | 0.052136 | 98.053006 | 1012.730326 | 124.000000 | 555.025000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.043478 | 0.000000 | 11.100595 | 1.731264 | 5.151212 | 0.426692 | 78.061664 | 1.000000 | 33.260815 |
| 50% | 0.451570 | 161.421908 | 1017.551190 | 305.461538 | 1178.472222 | 1.245679 | 0.000000 | 0.200000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.166667 | 0.000000 | 16.452994 | 6.066667 | 6.307456 | 29.788889 | 112.923194 | 1.000000 | 39.206134 |
| 75% | 3.048856 | 531.752215 | 1022.387847 | 526.905769 | 1940.062500 | 64.048377 | 0.684211 | 0.457143 | 0.111111 | 0.000000 | ... | 0.000000 | 0.400000 | 0.000000 | 22.207692 | 10.543750 | 7.869230 | 162.517857 | 158.830373 | 1.000000 | 46.746905 |
| max | 65.095930 | 916.616505 | 1047.266667 | 912.222222 | 3129.000000 | 930.432099 | 61.356436 | 1.000000 | 1.000000 | 1.000000 | ... | 0.375000 | 1.000000 | 1.000000 | 41.707692 | 20.911111 | 20.612766 | 1393.111111 | 1020.936335 | 1.000000 | 75.535308 |
8 rows × 24 columns
In [15]:
# 1) Setup: plot styling for the exploration figures
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (15, 5)
# 2) Feature list (identical to the modeling cell above; redefined so this
#    exploration section can run standalone)
numeric_features = [
    "RR", "ALTI", "PMERM", "INST", "GLOT", "DHUMI40", "NEIGETOTX", "BROU", "ORAG",
    "GRESIL", "GRELE", "ROSEE", "VERGLAS", "GELEE", "FUMEE", "BRUME", "ECLAIR",
    "TX", "TNSOL", "FXY", "DRR", "density", "Workday?", "day_of_week"
]
# 3) Keep the full column list around for reference
all_columns = train_data.columns.tolist()
# 4) Force a uniform float dtype so histograms/pairplots treat every feature as
#    continuous (this also converts the "day_of_week" category to float values).
# BUG FIX: the original ran the train_data cast twice — the second copy was
# labeled "test data" but operated on train_data again, so X_test was never
# cast. The second line now casts X_test as the comment intended.
# train data
train_data[numeric_features] = train_data[numeric_features].astype(float)
# test data
X_test[numeric_features] = X_test[numeric_features].astype(float)
In [16]:
# 3) Histogram grid: one panel per feature, three panels per row
n_cols = 3
n_rows = -(-len(numeric_features) // n_cols)  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axs = axs.flatten()
# Pair each feature with an axis; leftover axes are removed afterwards
for ax, col in zip(axs, numeric_features):
    sns.histplot(data=train_data, x=col, ax=ax, kde=True, bins=30)
    ax.set_title(f"Distribution of {col}")
    ax.tick_params(axis='x', rotation=45)
# Drop the unused trailing subplots so the grid has no empty panels
for extra_ax in axs[len(numeric_features):]:
    fig.delaxes(extra_ax)
plt.tight_layout()
# Persist the figure to the output directory for the report
histogram_path = os.path.join(figures_path, "numeric_features_histograms.png")
plt.savefig(histogram_path, dpi=300)
# Show the plot
plt.show()
In [18]:
# Plot pairplot to visualize pairwise relationships between features.
# NOTE(review): with 24 features this renders a 24x24 panel grid, which is very
# slow and hard to read — consider restricting to a handful of key features.
sns.pairplot(train_data[numeric_features])
plt.suptitle("Pairplot of Features")
plt.show()